library(ggplot2)
library(dplyr)
library(ggdist)
library(caret)
library(tidyverse)
library(plotly)
library(networkD3)
library(scales)
library(reshape)
library(klaR)
library(countrycode)
library(ISLR)
library(rpart)
library(rpart.plot)
set.seed(999)
df <- read.csv("salaries.csv", stringsAsFactors = TRUE)
summary(df)
## work_year experience_level employment_type
## Min. :2020 EN:147 CT: 8
## 1st Qu.:2022 EX: 45 FL: 5
## Median :2022 MI:337 FT:1306
## Mean :2022 SE:803 PT: 13
## 3rd Qu.:2022
## Max. :2022
##
## job_title salary salary_currency
## Data Scientist :344 Min. : 2324 USD :1019
## Data Engineer :320 1st Qu.: 80000 EUR : 134
## Data Analyst :190 Median : 130000 GBP : 81
## Machine Learning Engineer: 86 Mean : 237712 INR : 40
## Analytics Engineer : 43 3rd Qu.: 175100 CAD : 21
## Data Architect : 37 Max. :30400000 AUD : 8
## (Other) :312 (Other): 29
## salary_in_usd employee_residence remote_ratio company_location
## Min. : 2324 US :910 Min. : 0.00 US :939
## 1st Qu.: 75593 GB : 83 1st Qu.: 0.00 GB : 87
## Median :120000 IN : 45 Median :100.00 CA : 39
## Mean :123375 CA : 38 Mean : 63.85 IN : 36
## 3rd Qu.:164997 DE : 31 3rd Qu.:100.00 DE : 35
## Max. :600000 ES : 29 Max. :100.00 ES : 28
## (Other):196 (Other):168
## company_size
## L:322
## M:898
## S:112
##
##
##
##
str(df)
## 'data.frame': 1332 obs. of 11 variables:
## $ work_year : int 2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
## $ experience_level : Factor w/ 4 levels "EN","EX","MI",..: 3 3 3 3 3 3 4 4 4 4 ...
## $ employment_type : Factor w/ 4 levels "CT","FL","FT",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ job_title : Factor w/ 64 levels "3D Computer Vision Researcher",..: 47 47 31 31 31 31 22 22 31 31 ...
## $ salary : int 130000 90000 120000 100000 85000 78000 161000 110000 136000 104000 ...
## $ salary_currency : Factor w/ 18 levels "AUD","BRL","CAD",..: 18 18 18 18 18 18 18 18 18 18 ...
## $ salary_in_usd : int 130000 90000 120000 100000 85000 78000 161000 110000 136000 104000 ...
## $ employee_residence: Factor w/ 64 levels "AE","AR","AT",..: 63 63 63 63 63 63 63 63 63 63 ...
## $ remote_ratio : int 0 0 100 100 100 100 100 100 100 100 ...
## $ company_location : Factor w/ 59 levels "AE","AL","AR",..: 58 58 58 58 58 58 58 58 58 58 ...
## $ company_size : Factor w/ 3 levels "L","M","S": 2 2 2 2 2 2 2 2 2 2 ...
anyNA(df)
## [1] FALSE
df$salary <- NULL
df$remote_ratio <- as.factor(df$remote_ratio)
levels(df$remote_ratio) <- list("Office" = 0, "Hybrid" = 50, "Remote" = 100)
df$salary_group <- cut(df$salary_in_usd, c(0, 25000, 50000, 75000, 100000, 125000, 150000, 200000, 250000, 1000000))
df$salary_group <- as.factor(df$salary_group)
levels(df$salary_group) <- c("25k", "50k", "75k", "100k", "125k", "150k", "200k", "250k", "1M")
# group df$employee_residence to different regions in the world. The data is in ISO 3166-1 alpha-2 format. Regions like North America, Europe, Asia, etc.
northAmerica <-c("US", "CA", "MX")
southAmerica <- c("AR", "BO", "BR", "CL", "CO")
centralAmerica <- c("CR", "DO", "HN", "PR")
europe <- c("AL", "AT", "AX", "BE", "BG", "CH", "CZ", "DE", "DK", "EE", "ES", "FI", "FR", "GB", "GR", "HR", "HU", "IE", "IS", "IT", "JE", "LU", "MD", "MT", "NL", "PL", "PT", "RO", "RS", "RU", "SI", "TR", "UA")
asia <- c("AE", "CN", "HK", "ID", "IL", "IN", "IQ", "IR", "JP", "MY", "PH", "PK", "QA", "SG", "TH", "VN")
africa <- c("DZ", "EG", "KE", "NG", "TN", "TZ")
oceania <- c("AU", "NZ", "AS")
df$employee_region <- as.factor(df$employee_residence)
levels(df$employee_region) <- list(
"North America" = northAmerica,
"South America" = southAmerica,
"Central America" = centralAmerica,
"Europe" = europe,
"Asia" = asia,
"Africa" = africa,
"Oceania" = oceania)
df$company_region <- as.factor(df$company_location)
levels(df$company_region) <- list(
"North America" = northAmerica,
"South America" = southAmerica,
"Central America" = centralAmerica,
"Europe" = europe,
"Asia" = asia,
"Africa" = africa,
"Oceania" = oceania)
aux <- data.frame(matrix(0, nrow = length(levels(df$company_region)), ncol = length(levels(df$company_region))))
colnames(aux) <- levels(df$company_region)
rownames(aux) <- levels(df$company_region)
for (i in 1:nrow(aux)) {
for (j in 1:ncol(aux)) {
aux[i, j] <- length(which(df$company_region == colnames(aux)[j] & df$employee_region == rownames(aux)[i]))
}
}
data_long <- aux %>%
rownames_to_column %>%
gather(key = 'key', value = 'value', -rowname) %>%
filter(value > 0)
colnames(data_long) <- c("source", "target", "value")
data_long$target <- paste(data_long$target, " ", sep="")
nodes <- data.frame(name=c(as.character(data_long$source), as.character(data_long$target)) %>% unique())
data_long$IDsource=match(data_long$source, nodes$name)-1
data_long$IDtarget=match(data_long$target, nodes$name)-1
sankeyNetwork(Links = data_long, Nodes = nodes,
Source = "IDsource", Target = "IDtarget",
Value = "value", NodeID = "name",
sinksRight=FALSE, nodeWidth=30, fontSize=14, nodePadding=15)
levels(factor(df$job_title))
## [1] "3D Computer Vision Researcher"
## [2] "AI Scientist"
## [3] "Analytics Engineer"
## [4] "Applied Data Scientist"
## [5] "Applied Machine Learning Scientist"
## [6] "Applied Scientist"
## [7] "BI Analyst"
## [8] "BI Data Analyst"
## [9] "Big Data Architect"
## [10] "Big Data Engineer"
## [11] "Business Data Analyst"
## [12] "Cloud Data Architect"
## [13] "Cloud Data Engineer"
## [14] "Computer Vision Engineer"
## [15] "Computer Vision Software Engineer"
## [16] "Data Analyst"
## [17] "Data Analytics Consultant"
## [18] "Data Analytics Engineer"
## [19] "Data Analytics Lead"
## [20] "Data Analytics Manager"
## [21] "Data Architect"
## [22] "Data Engineer"
## [23] "Data Engineering Manager"
## [24] "Data Manager"
## [25] "Data Operations Analyst"
## [26] "Data Operations Engineer"
## [27] "Data Science Consultant"
## [28] "Data Science Engineer"
## [29] "Data Science Lead"
## [30] "Data Science Manager"
## [31] "Data Scientist"
## [32] "Data Scientist Lead"
## [33] "Data Specialist"
## [34] "Director of Data Engineering"
## [35] "Director of Data Science"
## [36] "ETL Developer"
## [37] "Finance Data Analyst"
## [38] "Financial Data Analyst"
## [39] "Head of Data"
## [40] "Head of Data Science"
## [41] "Head of Machine Learning"
## [42] "Lead Data Analyst"
## [43] "Lead Data Engineer"
## [44] "Lead Data Scientist"
## [45] "Lead Machine Learning Engineer"
## [46] "Machine Learning Developer"
## [47] "Machine Learning Engineer"
## [48] "Machine Learning Infrastructure Engineer"
## [49] "Machine Learning Manager"
## [50] "Machine Learning Research Engineer"
## [51] "Machine Learning Scientist"
## [52] "Marketing Data Analyst"
## [53] "ML Engineer"
## [54] "NLP Engineer"
## [55] "Power BI Developer"
## [56] "Principal Data Analyst"
## [57] "Principal Data Architect"
## [58] "Principal Data Engineer"
## [59] "Principal Data Scientist"
## [60] "Product Data Analyst"
## [61] "Product Data Scientist"
## [62] "Research Engineer"
## [63] "Research Scientist"
## [64] "Staff Data Scientist"
df$industry <- ifelse( grepl("data", df$job_title, ignore.case = T), "DATA", "OTHER")
df$industry <- ifelse( grepl("machine", df$job_title, ignore.case = T), "ML/AI", df$industry)
df$industry <- ifelse( grepl("ML", df$job_title, ignore.case = T), "ML/AI", df$industry)
df$industry <- ifelse( grepl("AI", df$job_title, ignore.case = T), "ML/AI", df$industry)
df$industry <- as.factor(df$industry)
df$boss <- ifelse(grepl("head", df$job_title, ignore.case = T), TRUE, FALSE)
df$boss <- ifelse(grepl("lead", df$job_title, ignore.case = T), TRUE, df$boss)
df$boss <- ifelse(grepl("principal", df$job_title, ignore.case = T), TRUE, df$boss)
df$boss <- ifelse(grepl("director", df$job_title, ignore.case = T), TRUE, df$boss)
df$role <- ifelse(grepl("scientist", df$job_title, ignore.case = T), "SCIENTIST", "OTHER")
df$role <- ifelse(grepl("engineer", df$job_title, ignore.case = T), "ENGINEER", df$role)
df$role <- ifelse(grepl("analyst", df$job_title, ignore.case = T), "ANALYST", df$role)
df$role <- ifelse(grepl("manager", df$job_title, ignore.case = T), "MANAGER", df$role)
df$role <- ifelse(grepl("architect", df$job_title, ignore.case = T), "ARCHITECT", df$role)
df$role <- ifelse(grepl("developer", df$job_title, ignore.case = T), "DEVELOPER", df$role)
df$role <- as.factor(df$role)
df$research <- ifelse(grepl("research", df$job_title, ignore.case = T), TRUE, FALSE)
df$job_title[df$job_title == "ML Engineer"] <- "Machine Learning Engineer"
summary(df)
## work_year experience_level employment_type
## Min. :2020 EN:147 CT: 8
## 1st Qu.:2022 EX: 45 FL: 5
## Median :2022 MI:337 FT:1306
## Mean :2022 SE:803 PT: 13
## 3rd Qu.:2022
## Max. :2022
##
## job_title salary_currency salary_in_usd
## Data Scientist :344 USD :1019 Min. : 2324
## Data Engineer :320 EUR : 134 1st Qu.: 75593
## Data Analyst :190 GBP : 81 Median :120000
## Machine Learning Engineer:102 INR : 40 Mean :123375
## Analytics Engineer : 43 CAD : 21 3rd Qu.:164997
## Data Architect : 37 AUD : 8 Max. :600000
## (Other) :296 (Other): 29
## employee_residence remote_ratio company_location company_size salary_group
## US :910 Office:412 US :939 L:322 200k :283
## GB : 83 Hybrid:139 GB : 87 M:898 150k :231
## IN : 45 Remote:781 CA : 39 S:112 100k :201
## CA : 38 IN : 36 75k :155
## DE : 31 DE : 35 125k :155
## ES : 29 ES : 28 250k :104
## (Other):196 (Other):168 (Other):203
## employee_region company_region industry boss
## North America :951 North America :982 DATA :1076 Mode :logical
## South America : 26 South America : 18 ML/AI: 157 FALSE:1276
## Central America: 8 Central America: 5 OTHER: 99 TRUE :56
## Europe :253 Europe :247
## Asia : 77 Asia : 62
## Africa : 7 Africa : 6
## Oceania : 10 Oceania : 12
## role research
## ANALYST :232 Mode :logical
## ARCHITECT: 40 FALSE:1306
## DEVELOPER: 16 TRUE :26
## ENGINEER :527
## MANAGER : 50
## OTHER : 41
## SCIENTIST:426
mean(df$salary_in_usd)
## [1] 123374.7
mean(df[df$boss == FALSE, 'salary_in_usd'])
## [1] 121704.5
mean(df[df$boss == TRUE, 'salary_in_usd'])
## [1] 161430.6
numerics_names <- colnames(df[, sapply(df, is.numeric)])
factors_names <- colnames(df[, sapply(df, is.factor)])
for (i in 1:length(numerics_names)){
print(ggplot(df, aes_string(x= numerics_names[i])) +
ggdist::stat_halfeye(
adjust=0.5,
justification = -.2,
.width=0
) +
geom_boxplot(
width = .1,
alpha = 0.5
) +
labs(title=paste("Distribution and boxplot of",numerics_names[i]), y="count"))
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation ideoms with `aes()`
## Warning: Using the `size` aesthietic with geom_segment was deprecated in ggplot2 3.4.0.
## ℹ Please use the `linewidth` aesthetic instead.
for (i in 1:length(factors_names)){
print(ggplot(df, aes_string(x= factors_names[i])) +
geom_bar()+
labs(title=paste("Histogram of",factors_names[i]),y="count"))
}
plot(df$employee_residence, df$salary_in_usd, xlab="Employee Residence", ylab="Salary in USD")
plot(df$company_size, df$salary_in_usd, xlab="Company Size", ylab="Salary in USD")
plot(df$company_location, df$salary_in_usd, xlab="Company Location", ylab="Salary in USD")
plot(df$experience_level, df$salary_in_usd, xlab="Experience Level", ylab="Salary in USD")
plot(df$remote_ratio, df$salary_in_usd, xlab="Remote Ratio", ylab="Salary in USD")
y_2020 <- df[df$work_year == 2020,]
y_2021 <- df[df$work_year == 2021,]
y_2022 <- df[df$work_year == 2022,]
y_2020 <- y_2020 %>% group_by(remote_ratio) %>% summarise(count = n())
y_2021 <- y_2021 %>% group_by(remote_ratio) %>% summarise(count = n())
y_2022 <- y_2022 %>% group_by(remote_ratio) %>% summarise(count = n())
y_2020$percentage <- y_2020$count / sum(y_2020$count) * 100
y_2021$percentage <- y_2021$count / sum(y_2021$count) * 100
y_2022$percentage <- y_2022$count / sum(y_2022$count) * 100
y_2020$work_year <- 2020
y_2021$work_year <- 2021
y_2022$work_year <- 2022
y <- rbind(y_2020, y_2021, y_2022)
ggplot(y, aes(x = work_year, y = percentage, group = remote_ratio, color = remote_ratio)) +
geom_line()+
geom_point() +
labs(x = "Work Year", y = "Percentage of Remote Ratio", title = "Remote Ratio by Work Year") +
theme(plot.title = element_text(hjust = 0.5), axis.text=element_text(size=12),
axis.title=element_text(size=12)) +
scale_x_continuous(breaks = c(2020, 2021, 2022))
#First we remove some columns that wont bring useful information to the model:
dfTreeRegression <- df
dfTreeRegression$salary_currency <- NULL
#We are going to try to predict salary with a regression tree. We are going to eliminate the created "salary_group" column because otherwise it would be kinda cheating since its the value that we are trying to predict.
dfTreeRegression$salary_group <- NULL
#After making a couple of tests, we saw that these variables are never used by the tree to make predictions, so we are going to remove them from the set so getting the best options for our profiles is easier later:
dfTreeRegression$company_region <- NULL
dfTreeRegression$employee_region <- NULL
dfTreeRegression$work_year <- NULL
dfTreeRegression$research <- NULL
dfTreeRegression$employment_type <- NULL
dfTreeRegression$role <- NULL
dfTreeRegression$boss <- NULL
set.seed(42)
sample <- sample(c(TRUE, FALSE), nrow(dfTreeRegression), replace=TRUE, prob=c(0.8,0.2))
Tree_train <- dfTreeRegression[sample, ]
Tree_test <- dfTreeRegression[!sample, ]
#Con el parametro "cp", cuanto mas proximo a 0, menos poda hace.
fit <- rpart(salary_in_usd~., data = Tree_train, method="anova", control = list(cp = 0.0001))
rules <- rpart.rules(fit)
test_pred <- predict(fit, Tree_test, method="anova")
results <- data.frame(test_pred, Tree_test$salary_in_usd)
results$residuals <- results$Tree_test.salary_in_usd-results$test_pred
plot(results$residuals)
abline(0,0, col="red")
resmean <- mean(results$residuals)
abline(resmean,0, col="blue")
sd(results$residuals)
## [1] 57435.01
nrow(rules)
## [1] 51
datos_monica <- Tree_test[1,]
datos_monica[1,] = list("MI", "Data Scientist",0, "ES", "Hybrid", "ES", "S", "DATA")
options(width=1000)
salarios = rpart.predict(fit, datos_monica)
salarios
## 1
## 40856.43
wallis_grid <- expand.grid(
experience_level = c("EN"),
job_title = c("ML Engineer"),
salary_in_usd = 0,
employee_residence = c("US"),
remote_ratio = levels(df$remote_ratio),
company_location = c("US"),
company_size = levels(df$company_size),
industry = c("ML/AI")
)
wallis_grid
## experience_level job_title salary_in_usd employee_residence remote_ratio company_location company_size industry
## 1 EN ML Engineer 0 US Office US L ML/AI
## 2 EN ML Engineer 0 US Hybrid US L ML/AI
## 3 EN ML Engineer 0 US Remote US L ML/AI
## 4 EN ML Engineer 0 US Office US M ML/AI
## 5 EN ML Engineer 0 US Hybrid US M ML/AI
## 6 EN ML Engineer 0 US Remote US M ML/AI
## 7 EN ML Engineer 0 US Office US S ML/AI
## 8 EN ML Engineer 0 US Hybrid US S ML/AI
## 9 EN ML Engineer 0 US Remote US S ML/AI
prediction <- rpart.predict(fit, wallis_grid)
wallis_max <- max(prediction)
wallis_best_combinations_index <- which(wallis_max == prediction)
wallis_best_combinations <- wallis_grid[wallis_best_combinations_index,]
wallis_best_combinations
## experience_level job_title salary_in_usd employee_residence remote_ratio company_location company_size industry
## 4 EN ML Engineer 0 US Office US M ML/AI
## 5 EN ML Engineer 0 US Hybrid US M ML/AI
## 6 EN ML Engineer 0 US Remote US M ML/AI
## 7 EN ML Engineer 0 US Office US S ML/AI
## 8 EN ML Engineer 0 US Hybrid US S ML/AI
## 9 EN ML Engineer 0 US Remote US S ML/AI
wallis_max
## [1] 102330.8
dawid_grid <- expand.grid(
experience_level = c("MI"),
job_title = c("ML Engineer"),
salary_in_usd = 0,
employee_residence = c("ES"),
remote_ratio = c("Remote"),
company_location = levels(df$company_location),
company_size = levels(df$company_size),
industry = c("ML/AI")
)
dawid_grid
## experience_level job_title salary_in_usd employee_residence remote_ratio company_location company_size industry
## 1 MI ML Engineer 0 ES Remote AE L ML/AI
## 2 MI ML Engineer 0 ES Remote AL L ML/AI
## 3 MI ML Engineer 0 ES Remote AR L ML/AI
## 4 MI ML Engineer 0 ES Remote AS L ML/AI
## 5 MI ML Engineer 0 ES Remote AT L ML/AI
## 6 MI ML Engineer 0 ES Remote AU L ML/AI
## 7 MI ML Engineer 0 ES Remote BE L ML/AI
## 8 MI ML Engineer 0 ES Remote BO L ML/AI
## 9 MI ML Engineer 0 ES Remote BR L ML/AI
## 10 MI ML Engineer 0 ES Remote CA L ML/AI
## 11 MI ML Engineer 0 ES Remote CH L ML/AI
## 12 MI ML Engineer 0 ES Remote CL L ML/AI
## 13 MI ML Engineer 0 ES Remote CN L ML/AI
## 14 MI ML Engineer 0 ES Remote CO L ML/AI
## 15 MI ML Engineer 0 ES Remote CZ L ML/AI
## 16 MI ML Engineer 0 ES Remote DE L ML/AI
## 17 MI ML Engineer 0 ES Remote DK L ML/AI
## 18 MI ML Engineer 0 ES Remote DZ L ML/AI
## 19 MI ML Engineer 0 ES Remote EE L ML/AI
## 20 MI ML Engineer 0 ES Remote EG L ML/AI
## 21 MI ML Engineer 0 ES Remote ES L ML/AI
## 22 MI ML Engineer 0 ES Remote FI L ML/AI
## 23 MI ML Engineer 0 ES Remote FR L ML/AI
## 24 MI ML Engineer 0 ES Remote GB L ML/AI
## 25 MI ML Engineer 0 ES Remote GR L ML/AI
## 26 MI ML Engineer 0 ES Remote HN L ML/AI
## 27 MI ML Engineer 0 ES Remote HR L ML/AI
## 28 MI ML Engineer 0 ES Remote HU L ML/AI
## 29 MI ML Engineer 0 ES Remote ID L ML/AI
## 30 MI ML Engineer 0 ES Remote IE L ML/AI
## 31 MI ML Engineer 0 ES Remote IL L ML/AI
## 32 MI ML Engineer 0 ES Remote IN L ML/AI
## 33 MI ML Engineer 0 ES Remote IQ L ML/AI
## 34 MI ML Engineer 0 ES Remote IR L ML/AI
## 35 MI ML Engineer 0 ES Remote IT L ML/AI
## 36 MI ML Engineer 0 ES Remote JP L ML/AI
## 37 MI ML Engineer 0 ES Remote KE L ML/AI
## 38 MI ML Engineer 0 ES Remote LU L ML/AI
## 39 MI ML Engineer 0 ES Remote MD L ML/AI
## 40 MI ML Engineer 0 ES Remote MT L ML/AI
## 41 MI ML Engineer 0 ES Remote MX L ML/AI
## 42 MI ML Engineer 0 ES Remote MY L ML/AI
## 43 MI ML Engineer 0 ES Remote NG L ML/AI
## 44 MI ML Engineer 0 ES Remote NL L ML/AI
## 45 MI ML Engineer 0 ES Remote NZ L ML/AI
## 46 MI ML Engineer 0 ES Remote PH L ML/AI
## 47 MI ML Engineer 0 ES Remote PK L ML/AI
## 48 MI ML Engineer 0 ES Remote PL L ML/AI
## 49 MI ML Engineer 0 ES Remote PR L ML/AI
## 50 MI ML Engineer 0 ES Remote PT L ML/AI
## 51 MI ML Engineer 0 ES Remote RO L ML/AI
## 52 MI ML Engineer 0 ES Remote RU L ML/AI
## 53 MI ML Engineer 0 ES Remote SG L ML/AI
## 54 MI ML Engineer 0 ES Remote SI L ML/AI
## 55 MI ML Engineer 0 ES Remote TH L ML/AI
## 56 MI ML Engineer 0 ES Remote TR L ML/AI
## 57 MI ML Engineer 0 ES Remote UA L ML/AI
## 58 MI ML Engineer 0 ES Remote US L ML/AI
## 59 MI ML Engineer 0 ES Remote VN L ML/AI
## 60 MI ML Engineer 0 ES Remote AE M ML/AI
## 61 MI ML Engineer 0 ES Remote AL M ML/AI
## 62 MI ML Engineer 0 ES Remote AR M ML/AI
## 63 MI ML Engineer 0 ES Remote AS M ML/AI
## 64 MI ML Engineer 0 ES Remote AT M ML/AI
## 65 MI ML Engineer 0 ES Remote AU M ML/AI
## 66 MI ML Engineer 0 ES Remote BE M ML/AI
## 67 MI ML Engineer 0 ES Remote BO M ML/AI
## 68 MI ML Engineer 0 ES Remote BR M ML/AI
## 69 MI ML Engineer 0 ES Remote CA M ML/AI
## 70 MI ML Engineer 0 ES Remote CH M ML/AI
## 71 MI ML Engineer 0 ES Remote CL M ML/AI
## 72 MI ML Engineer 0 ES Remote CN M ML/AI
## 73 MI ML Engineer 0 ES Remote CO M ML/AI
## 74 MI ML Engineer 0 ES Remote CZ M ML/AI
## 75 MI ML Engineer 0 ES Remote DE M ML/AI
## 76 MI ML Engineer 0 ES Remote DK M ML/AI
## 77 MI ML Engineer 0 ES Remote DZ M ML/AI
## 78 MI ML Engineer 0 ES Remote EE M ML/AI
## 79 MI ML Engineer 0 ES Remote EG M ML/AI
## 80 MI ML Engineer 0 ES Remote ES M ML/AI
## 81 MI ML Engineer 0 ES Remote FI M ML/AI
## 82 MI ML Engineer 0 ES Remote FR M ML/AI
## 83 MI ML Engineer 0 ES Remote GB M ML/AI
## 84 MI ML Engineer 0 ES Remote GR M ML/AI
## 85 MI ML Engineer 0 ES Remote HN M ML/AI
## 86 MI ML Engineer 0 ES Remote HR M ML/AI
## 87 MI ML Engineer 0 ES Remote HU M ML/AI
## 88 MI ML Engineer 0 ES Remote ID M ML/AI
## 89 MI ML Engineer 0 ES Remote IE M ML/AI
## 90 MI ML Engineer 0 ES Remote IL M ML/AI
## 91 MI ML Engineer 0 ES Remote IN M ML/AI
## 92 MI ML Engineer 0 ES Remote IQ M ML/AI
## 93 MI ML Engineer 0 ES Remote IR M ML/AI
## 94 MI ML Engineer 0 ES Remote IT M ML/AI
## 95 MI ML Engineer 0 ES Remote JP M ML/AI
## 96 MI ML Engineer 0 ES Remote KE M ML/AI
## 97 MI ML Engineer 0 ES Remote LU M ML/AI
## 98 MI ML Engineer 0 ES Remote MD M ML/AI
## 99 MI ML Engineer 0 ES Remote MT M ML/AI
## 100 MI ML Engineer 0 ES Remote MX M ML/AI
## 101 MI ML Engineer 0 ES Remote MY M ML/AI
## 102 MI ML Engineer 0 ES Remote NG M ML/AI
## 103 MI ML Engineer 0 ES Remote NL M ML/AI
## 104 MI ML Engineer 0 ES Remote NZ M ML/AI
## 105 MI ML Engineer 0 ES Remote PH M ML/AI
## 106 MI ML Engineer 0 ES Remote PK M ML/AI
## 107 MI ML Engineer 0 ES Remote PL M ML/AI
## 108 MI ML Engineer 0 ES Remote PR M ML/AI
## 109 MI ML Engineer 0 ES Remote PT M ML/AI
## 110 MI ML Engineer 0 ES Remote RO M ML/AI
## 111 MI ML Engineer 0 ES Remote RU M ML/AI
## 112 MI ML Engineer 0 ES Remote SG M ML/AI
## 113 MI ML Engineer 0 ES Remote SI M ML/AI
## 114 MI ML Engineer 0 ES Remote TH M ML/AI
## 115 MI ML Engineer 0 ES Remote TR M ML/AI
## 116 MI ML Engineer 0 ES Remote UA M ML/AI
## 117 MI ML Engineer 0 ES Remote US M ML/AI
## 118 MI ML Engineer 0 ES Remote VN M ML/AI
## 119 MI ML Engineer 0 ES Remote AE S ML/AI
## 120 MI ML Engineer 0 ES Remote AL S ML/AI
## 121 MI ML Engineer 0 ES Remote AR S ML/AI
## 122 MI ML Engineer 0 ES Remote AS S ML/AI
## 123 MI ML Engineer 0 ES Remote AT S ML/AI
## 124 MI ML Engineer 0 ES Remote AU S ML/AI
## 125 MI ML Engineer 0 ES Remote BE S ML/AI
## 126 MI ML Engineer 0 ES Remote BO S ML/AI
## 127 MI ML Engineer 0 ES Remote BR S ML/AI
## 128 MI ML Engineer 0 ES Remote CA S ML/AI
## 129 MI ML Engineer 0 ES Remote CH S ML/AI
## 130 MI ML Engineer 0 ES Remote CL S ML/AI
## 131 MI ML Engineer 0 ES Remote CN S ML/AI
## 132 MI ML Engineer 0 ES Remote CO S ML/AI
## 133 MI ML Engineer 0 ES Remote CZ S ML/AI
## 134 MI ML Engineer 0 ES Remote DE S ML/AI
## 135 MI ML Engineer 0 ES Remote DK S ML/AI
## 136 MI ML Engineer 0 ES Remote DZ S ML/AI
## 137 MI ML Engineer 0 ES Remote EE S ML/AI
## 138 MI ML Engineer 0 ES Remote EG S ML/AI
## 139 MI ML Engineer 0 ES Remote ES S ML/AI
## 140 MI ML Engineer 0 ES Remote FI S ML/AI
## 141 MI ML Engineer 0 ES Remote FR S ML/AI
## 142 MI ML Engineer 0 ES Remote GB S ML/AI
## 143 MI ML Engineer 0 ES Remote GR S ML/AI
## 144 MI ML Engineer 0 ES Remote HN S ML/AI
## 145 MI ML Engineer 0 ES Remote HR S ML/AI
## 146 MI ML Engineer 0 ES Remote HU S ML/AI
## 147 MI ML Engineer 0 ES Remote ID S ML/AI
## 148 MI ML Engineer 0 ES Remote IE S ML/AI
## 149 MI ML Engineer 0 ES Remote IL S ML/AI
## 150 MI ML Engineer 0 ES Remote IN S ML/AI
## 151 MI ML Engineer 0 ES Remote IQ S ML/AI
## 152 MI ML Engineer 0 ES Remote IR S ML/AI
## 153 MI ML Engineer 0 ES Remote IT S ML/AI
## 154 MI ML Engineer 0 ES Remote JP S ML/AI
## 155 MI ML Engineer 0 ES Remote KE S ML/AI
## 156 MI ML Engineer 0 ES Remote LU S ML/AI
## 157 MI ML Engineer 0 ES Remote MD S ML/AI
## 158 MI ML Engineer 0 ES Remote MT S ML/AI
## 159 MI ML Engineer 0 ES Remote MX S ML/AI
## 160 MI ML Engineer 0 ES Remote MY S ML/AI
## 161 MI ML Engineer 0 ES Remote NG S ML/AI
## 162 MI ML Engineer 0 ES Remote NL S ML/AI
## 163 MI ML Engineer 0 ES Remote NZ S ML/AI
## 164 MI ML Engineer 0 ES Remote PH S ML/AI
## 165 MI ML Engineer 0 ES Remote PK S ML/AI
## 166 MI ML Engineer 0 ES Remote PL S ML/AI
## 167 MI ML Engineer 0 ES Remote PR S ML/AI
## 168 MI ML Engineer 0 ES Remote PT S ML/AI
## 169 MI ML Engineer 0 ES Remote RO S ML/AI
## 170 MI ML Engineer 0 ES Remote RU S ML/AI
## 171 MI ML Engineer 0 ES Remote SG S ML/AI
## 172 MI ML Engineer 0 ES Remote SI S ML/AI
## 173 MI ML Engineer 0 ES Remote TH S ML/AI
## 174 MI ML Engineer 0 ES Remote TR S ML/AI
## 175 MI ML Engineer 0 ES Remote UA S ML/AI
## 176 MI ML Engineer 0 ES Remote US S ML/AI
## 177 MI ML Engineer 0 ES Remote VN S ML/AI
prediction <- rpart.predict(fit, dawid_grid)
dawid_max <- max(prediction)
dawid_best_combinations_index <- which(dawid_max == prediction)
dawid_best_combinations <- dawid_grid[dawid_best_combinations_index,]
dawid_best_combinations
## experience_level job_title salary_in_usd employee_residence remote_ratio company_location company_size industry
## 1 MI ML Engineer 0 ES Remote AE L ML/AI
## 5 MI ML Engineer 0 ES Remote AT L ML/AI
## 7 MI ML Engineer 0 ES Remote BE L ML/AI
## 13 MI ML Engineer 0 ES Remote CN L ML/AI
## 18 MI ML Engineer 0 ES Remote DZ L ML/AI
## 22 MI ML Engineer 0 ES Remote FI L ML/AI
## 24 MI ML Engineer 0 ES Remote GB L ML/AI
## 33 MI ML Engineer 0 ES Remote IQ L ML/AI
## 36 MI ML Engineer 0 ES Remote JP L ML/AI
## 52 MI ML Engineer 0 ES Remote RU L ML/AI
## 54 MI ML Engineer 0 ES Remote SI L ML/AI
## 58 MI ML Engineer 0 ES Remote US L ML/AI
## 60 MI ML Engineer 0 ES Remote AE M ML/AI
## 64 MI ML Engineer 0 ES Remote AT M ML/AI
## 66 MI ML Engineer 0 ES Remote BE M ML/AI
## 72 MI ML Engineer 0 ES Remote CN M ML/AI
## 77 MI ML Engineer 0 ES Remote DZ M ML/AI
## 81 MI ML Engineer 0 ES Remote FI M ML/AI
## 83 MI ML Engineer 0 ES Remote GB M ML/AI
## 92 MI ML Engineer 0 ES Remote IQ M ML/AI
## 95 MI ML Engineer 0 ES Remote JP M ML/AI
## 111 MI ML Engineer 0 ES Remote RU M ML/AI
## 113 MI ML Engineer 0 ES Remote SI M ML/AI
## 117 MI ML Engineer 0 ES Remote US M ML/AI
## 119 MI ML Engineer 0 ES Remote AE S ML/AI
## 123 MI ML Engineer 0 ES Remote AT S ML/AI
## 125 MI ML Engineer 0 ES Remote BE S ML/AI
## 131 MI ML Engineer 0 ES Remote CN S ML/AI
## 136 MI ML Engineer 0 ES Remote DZ S ML/AI
## 140 MI ML Engineer 0 ES Remote FI S ML/AI
## 142 MI ML Engineer 0 ES Remote GB S ML/AI
## 151 MI ML Engineer 0 ES Remote IQ S ML/AI
## 154 MI ML Engineer 0 ES Remote JP S ML/AI
## 170 MI ML Engineer 0 ES Remote RU S ML/AI
## 172 MI ML Engineer 0 ES Remote SI S ML/AI
## 176 MI ML Engineer 0 ES Remote US S ML/AI
dawid_max
## [1] 109338.5
This could be useful for companies who want to retain their top talent and for employees that are underpaid. We could use different factors such to determine which employees are underpaid compared to their peers.
To answer this questions we will use two different techniques: Clustering and regression.
The following features will be used: - Experience level - Employment type - Industry - Employee region - Role - Remote_ratio - Boss - Research - Company size
First we create a separate dataframe for the variables we will use.
# Select the variables to use in the clustering analysis
df_vars <- dplyr::select(df, experience_level, employment_type, company_size, industry, employee_region, boss, research, role, remote_ratio)
#
# df_vars$experience_level <- as.numeric(df_vars$experience_level)
# df_vars$industry <- as.numeric(df_vars$industry)
# df_vars$employment_type <- as.numeric(df_vars$employment_type)
# df_vars$company_size <- as.numeric(df_vars$company_size)
# df_vars$employee_region <- as.numeric(df_vars$employee_region)
# df_vars$role <- as.numeric(df_vars$role)
# df_vars$boss <- as.numeric(df_vars$boss)
# df_vars$research <- as.numeric(df_vars$research)
# df_vars$remote_ratio <- as.numeric(df_vars$remote_ratio)
#
# # Standardize the variables
# df_vars <- scale(df_vars)
Let s find find the elbow in our plot, to find the “best” k for the kmodes algorithm
# results <- data.frame(k = integer(), WCSS = double())
#
# # Loop through a range of values for k
# for (k in 2:40) {
# # Run the k-means clustering algorithm
# km <- kmodes(df_vars, k, iter.max = 50)
# # Store the results in the data frame
# results <- rbind(results, data.frame(k = k, WCSS = sum(km$withindiff^2)))
# }
#
# # Plot the WCSS values for each value of k
# ggplot(results, aes(x = k, y = WCSS)) +
# geom_line() +
# geom_point() +
# labs(x = "Number of clusters (k)", y = "WCSS")
# Perform k-means clustering with n clusters
kmodes_results <- kmodes(df_vars, 20)
df_Q7 <- df
# Add the cluster labels to the original data
df_Q7$cluster <- kmodes_results$cluster
# Inspect the clusters
group_by(df_Q7, cluster) %>%
summarize(mean_salary = mean(salary_in_usd))
## # A tibble: 20 × 2
## cluster mean_salary
## <int> <dbl>
## 1 1 65656.
## 2 2 89992.
## 3 3 34249.
## 4 4 84749.
## 5 5 258496.
## 6 6 75476.
## 7 7 146911.
## 8 8 47324.
## 9 9 118519.
## 10 10 151188.
## 11 11 76042.
## 12 12 88378.
## 13 13 55665.
## 14 14 148238.
## 15 15 157399.
## 16 16 100367.
## 17 17 78672.
## 18 18 142089.
## 19 19 151811.
## 20 20 126383.
fig1 <- plot_ly(df_Q7, x = ~as.numeric(employee_region), y = ~as.numeric(industry), z = ~as.numeric(experience_level), color = ~cluster)
fig1 <- fig1 %>% add_markers()
axx <- list(ticketmode = 'array', title="Employee region", ticktext = levels(df$employee_region), tickvals = c(1,2,3,4,5,6,7), range = c(1,7), tickangle = 25)
axy <- list(ticketmode = 'array', title="Industry", ticktext = levels(df$industry), tickvals = c(1,2,3), range = c(1,3), tickangle = 45)
axz <- list(ticketmode = 'array', title="Experience level", ticktext = levels(df$experience_level), tickvals = c(1,2,3,4), range = c(1,4), tickangle = 45)
fig1 <- fig1 %>% layout(scene = list(xaxis = axx, yaxis = axy, zaxis = axz))
fig1
fig2 <- plot_ly(df_Q7, x = ~as.numeric(role), y = ~as.numeric(industry), z = ~as.numeric(remote_ratio), color = ~cluster)
fig2 <- fig2 %>% add_markers()
axx <- list(ticketmode = 'array', title="Role", ticktext = levels(df$role), tickvals = c(1,2,3,4,5,6,7), range = c(1,7), tickangle = 25)
axy <- list(ticketmode = 'array', title="Industry", ticktext = levels(df$industry), tickvals = c(1,2,3), range = c(1,3), tickangle = 45)
axz <- list(ticketmode = 'array', title="Remote ratio", ticktext = levels(df$remote_ratio), tickvals = c(1,2,3), range = c(1,3), tickangle = 45)
fig2 <- fig2 %>% layout(scene = list(xaxis = axx, yaxis = axy, zaxis = axz))
fig2
fig3 <- plot_ly(df_Q7, x = ~as.numeric(role), y = ~as.numeric(industry), z = ~as.numeric(remote_ratio), color = ~cluster)
fig3 <- fig3 %>% add_markers()
axx <- list(ticketmode = 'array', title="Role", ticktext = levels(df$role), tickvals = c(1,2,3,4,5,6,7), range = c(1,7), tickangle = 25)
axy <- list(ticketmode = 'array', title="Industry", ticktext = levels(df$industry), tickvals = c(1,2,3), range = c(1,3), tickangle = 45)
axz <- list(ticketmode = 'array', title="Remote ratio", ticktext = levels(df$remote_ratio), tickvals = c(1,2,3), range = c(1,3), tickangle = 45)
fig3 <- fig3 %>% layout(scene = list(xaxis = axx, yaxis = axy, zaxis = axz))
fig3
mean_salary <- tapply(df_Q7$salary_in_usd, df_Q7$cluster, mean)
# View the mean salary for each level of the experience_level variable
df_Q7$salary_difference_kmodes <- 0
df_Q7$predicted_salary_kmodes <-0
for (i in 1:nrow(df_Q7)) {
cluster_index = match(df_Q7$cluster[i], names(mean_salary))
df_Q7$salary_difference_kmodes[i] <- df_Q7$salary_in_usd[i] - mean_salary[cluster_index]
df_Q7$predicted_salary_kmodes[i] <- mean_salary[cluster_index]
}
underpaid_employees_kmodes <- df_Q7[df_Q7$salary_difference_kmodes < 0, ]
overpaid_employees_kmodes <- df_Q7[df_Q7$salary_difference_kmodes > 0, ]
underpaid_employees_kmodes <- underpaid_employees_kmodes[order(underpaid_employees_kmodes$salary_difference_kmodes), ]
overpaid_employees_kmodes <- overpaid_employees_kmodes[order(overpaid_employees_kmodes$salary_difference_kmodes, decreasing = TRUE), ]
# View the underpaid and overpaid employees
head(underpaid_employees_kmodes, n = 10)
## work_year experience_level employment_type job_title salary_currency salary_in_usd employee_residence remote_ratio company_location company_size salary_group employee_region company_region industry boss role research cluster salary_difference_kmodes predicted_salary_kmodes
## 1183 2021 MI FT Head of Data Science USD 110000 US Office US S 125k North America North America DATA TRUE OTHER FALSE 5 -148496.3 258496.3
## 113 2021 SE FT 3D Computer Vision Researcher USD 5000 US Remote US L 25k North America North America OTHER FALSE OTHER TRUE 10 -146187.8 151187.8
## 754 2022 EX FT Lead Data Engineer CAD 116073 CA Remote CA S 125k North America North America DATA TRUE ENGINEER FALSE 5 -142423.3 258496.3
## 1300 2021 SE FT Computer Vision Engineer BRL 18907 BR Office BR M 25k South America South America OTHER FALSE ENGINEER FALSE 19 -132904.5 151811.5
## 1045 2022 EN FT Data Analytics Engineer USD 20000 PK Office PK M 25k Asia Asia DATA FALSE ENGINEER FALSE 19 -131811.5 151811.5
## 612 2022 MI FT Data Engineer USD 24000 US Office US M 25k North America North America DATA FALSE ENGINEER FALSE 19 -127811.5 151811.5
## 613 2022 MI FT Data Engineer USD 24000 US Office US M 25k North America North America DATA FALSE ENGINEER FALSE 19 -127811.5 151811.5
## 958 2022 SE FT Data Engineer USD 25000 US Remote US M 25k North America North America DATA FALSE ENGINEER FALSE 19 -126811.5 151811.5
## 604 2022 SE FT Data Scientist BRL 2428 BR Remote BR L 25k South America South America DATA FALSE SCIENTIST FALSE 20 -123955.2 126383.2
## 1167 2020 SE FT Data Engineer MXN 33511 MX Office MX S 50k North America North America DATA FALSE ENGINEER FALSE 19 -118300.5 151811.5
head(overpaid_employees_kmodes, n=10)
## work_year experience_level employment_type job_title salary_currency salary_in_usd employee_residence remote_ratio company_location company_size salary_group employee_region company_region industry boss role research cluster salary_difference_kmodes predicted_salary_kmodes
## 1094 2020 MI FT Research Scientist USD 450000 US Office US M 1M North America North America OTHER FALSE SCIENTIST TRUE 2 360008.3 89991.67
## 1151 2021 MI FT Financial Data Analyst USD 450000 US Remote US L 1M North America North America DATA FALSE ANALYST FALSE 16 349633.1 100366.87
## 1279 2021 EX FT Principal Data Engineer USD 600000 US Remote US L 1M North America North America DATA TRUE ENGINEER FALSE 5 341503.7 258496.29
## 1325 2021 MI FT Applied Machine Learning Scientist USD 423000 US Hybrid US L 1M North America North America ML/AI FALSE SCIENTIST FALSE 20 296616.8 126383.19
## 1328 2020 SE FT Data Scientist USD 412000 US Remote US L 1M North America North America DATA FALSE SCIENTIST FALSE 20 285616.8 126383.19
## 1040 2022 SE FT Applied Data Scientist USD 380000 US Remote US L 1M North America North America DATA FALSE SCIENTIST FALSE 10 228812.2 151187.78
## 1182 2020 SE FT Machine Learning Scientist USD 260000 JP Office JP S 1M Asia Asia ML/AI FALSE SCIENTIST FALSE 13 204334.7 55665.33
## 123 2022 SE FT Data Architect USD 345600 US Office US M 1M North America North America DATA FALSE ARCHITECT FALSE 19 193788.5 151811.50
## 1188 2021 SE FT Machine Learning Engineer USD 256000 US Remote US S 1M North America North America ML/AI FALSE ENGINEER FALSE 6 180524.2 75475.76
## 981 2022 EX FT Data Engineer USD 324000 US Remote US M 1M North America North America DATA FALSE ENGINEER FALSE 10 172812.2 151187.78
model <- lm(salary_in_usd ~ experience_level + employment_type + employee_region + company_size + industry + role + boss + research + remote_ratio, data = df_Q7)
summary(model)
##
## Call:
## lm(formula = salary_in_usd ~ experience_level + employment_type +
## employee_region + company_size + industry + role + boss +
## research + remote_ratio, data = df_Q7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -144040 -27814 -4622 24566 359318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 107605.8 18420.5 5.842 6.52e-09 ***
## experience_levelEX 91027.7 9256.9 9.834 < 2e-16 ***
## experience_levelMI 21839.8 5033.8 4.339 1.54e-05 ***
## experience_levelSE 44618.7 5042.9 8.848 < 2e-16 ***
## employment_typeFL -58306.6 27880.7 -2.091 0.03670 *
## employment_typeFT -22824.1 17458.1 -1.307 0.19132
## employment_typePT -18941.1 22108.2 -0.857 0.39174
## employee_regionSouth America -76319.0 9958.8 -7.663 3.51e-14 ***
## employee_regionCentral America -12658.5 17384.3 -0.728 0.46665
## employee_regionEurope -63878.0 4041.1 -15.807 < 2e-16 ***
## employee_regionAsia -78396.7 6321.1 -12.402 < 2e-16 ***
## employee_regionAfrica -52863.3 18875.1 -2.801 0.00517 **
## employee_regionOceania -34816.1 15700.4 -2.218 0.02676 *
## company_sizeM -4965.7 3461.9 -1.434 0.15170
## company_sizeS -12079.2 5508.6 -2.193 0.02850 *
## industryML/AI 12211.7 4480.8 2.725 0.00651 **
## industryOTHER 223.2 6380.2 0.035 0.97210
## roleARCHITECT 41632.7 8364.9 4.977 7.32e-07 ***
## roleDEVELOPER 21884.2 13437.0 1.629 0.10363
## roleENGINEER 27845.9 4039.5 6.893 8.46e-12 ***
## roleMANAGER 41158.1 7674.3 5.363 9.67e-08 ***
## roleOTHER 14733.2 9380.2 1.571 0.11650
## roleSCIENTIST 32090.9 4019.9 7.983 3.11e-15 ***
## bossTRUE 38375.5 7798.9 4.921 9.73e-07 ***
## researchTRUE 6031.9 11500.5 0.524 0.60003
## remote_ratioHybrid -9612.7 5362.0 -1.793 0.07324 .
## remote_ratioRemote -1348.6 2994.1 -0.450 0.65249
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48350 on 1305 degrees of freedom
## Multiple R-squared: 0.4728, Adjusted R-squared: 0.4623
## F-statistic: 45.02 on 26 and 1305 DF, p-value: < 2.2e-16
df_Q7$predicted_salary_lm <- predict(model, newdata = df_Q7)
df_Q7$salary_difference_lm <- df_Q7$salary_in_usd - df_Q7$predicted_salary_lm
# Use the salary difference column to identify underpaid and overpaid employees
underpaid_employees_lm <- df_Q7[df_Q7$salary_difference_lm < 0, ]
overpaid_employees_lm <- df_Q7[df_Q7$salary_difference_lm > 0, ]
underpaid_employees_lm <- underpaid_employees_lm[order(underpaid_employees_lm$salary_difference_lm), ]
overpaid_employees_lm <- overpaid_employees_lm[order(overpaid_employees_lm$salary_difference_lm, decreasing = TRUE), ]
# View the underpaid and overpaid employees
head(underpaid_employees_lm, n = 10)
## work_year experience_level employment_type job_title salary_currency salary_in_usd employee_residence remote_ratio company_location company_size salary_group employee_region company_region industry boss role research cluster salary_difference_kmodes predicted_salary_kmodes predicted_salary_lm salary_difference_lm
## 113 2021 SE FT 3D Computer Vision Researcher USD 5000 US Remote US L 25k North America North America OTHER FALSE OTHER TRUE 10 -146187.78 151187.78 149040.1 -144040.1
## 958 2022 SE FT Data Engineer USD 25000 US Remote US M 25k North America North America DATA FALSE ENGINEER FALSE 19 -126811.50 151811.50 150931.9 -125931.9
## 1244 2021 MI FT Data Scientist MXN 2859 MX Office MX S 25k North America North America DATA FALSE SCIENTIST FALSE 13 -52806.33 55665.33 126633.1 -123774.1
## 159 2022 SE FT Data Scientist USD 38000 US Remote US M 50k North America North America DATA FALSE SCIENTIST FALSE 14 -110237.54 148237.54 155176.9 -117176.9
## 163 2022 SE FT Data Scientist USD 38000 US Remote US M 50k North America North America DATA FALSE SCIENTIST FALSE 14 -110237.54 148237.54 155176.9 -117176.9
## 173 2022 SE FT Data Scientist USD 38000 US Remote US M 50k North America North America DATA FALSE SCIENTIST FALSE 14 -110237.54 148237.54 155176.9 -117176.9
## 754 2022 EX FT Lead Data Engineer CAD 116073 CA Remote CA S 125k North America North America DATA TRUE ENGINEER FALSE 5 -142423.29 258496.29 228602.9 -112529.9
## 1167 2020 SE FT Data Engineer MXN 33511 MX Office MX S 50k North America North America DATA FALSE ENGINEER FALSE 19 -118300.50 151811.50 145167.1 -111656.1
## 114 2022 MI FT Data Scientist USD 30000 MX Remote MX L 50k North America North America DATA FALSE SCIENTIST FALSE 20 -96383.19 126383.19 137363.7 -107363.7
## 612 2022 MI FT Data Engineer USD 24000 US Office US M 25k North America North America DATA FALSE ENGINEER FALSE 19 -127811.50 151811.50 129501.6 -105501.6
head(overpaid_employees_lm, n=10)
## work_year experience_level employment_type job_title salary_currency salary_in_usd employee_residence remote_ratio company_location company_size salary_group employee_region company_region industry boss role research cluster salary_difference_kmodes predicted_salary_kmodes predicted_salary_lm salary_difference_lm
## 1279 2021 EX FT Principal Data Engineer USD 600000 US Remote US L 1M North America North America DATA TRUE ENGINEER FALSE 5 341503.7 258496.29 240682.16 359317.8
## 1151 2021 MI FT Financial Data Analyst USD 450000 US Remote US L 1M North America North America DATA FALSE ANALYST FALSE 16 349633.1 100366.87 105272.80 344727.2
## 1094 2020 MI FT Research Scientist USD 450000 US Office US M 1M North America North America OTHER FALSE SCIENTIST TRUE 2 360008.3 89991.67 140001.66 309998.3
## 1325 2021 MI FT Applied Machine Learning Scientist USD 423000 US Hybrid US L 1M North America North America ML/AI FALSE SCIENTIST FALSE 20 296616.8 126383.19 141311.35 281688.7
## 1328 2020 SE FT Data Scientist USD 412000 US Remote US L 1M North America North America DATA FALSE SCIENTIST FALSE 20 285616.8 126383.19 160142.65 251857.3
## 1035 2022 SE FT Data Analytics Lead USD 405000 US Remote US L 1M North America North America DATA TRUE OTHER FALSE 5 146503.7 258496.29 181160.50 223839.5
## 1040 2022 SE FT Applied Data Scientist USD 380000 US Remote US L 1M North America North America DATA FALSE SCIENTIST FALSE 10 228812.2 151187.78 160142.65 219857.3
## 123 2022 SE FT Data Architect USD 345600 US Office US M 1M North America North America DATA FALSE ARCHITECT FALSE 19 193788.5 151811.50 166067.29 179532.7
## 1182 2020 SE FT Machine Learning Scientist USD 260000 JP Office JP S 1M Asia Asia ML/AI FALSE SCIENTIST FALSE 13 204334.7 55665.33 83227.02 176773.0
## 1250 2021 EX CT Principal Data Scientist USD 416000 US Remote US S 1M North America North America DATA TRUE SCIENTIST FALSE 5 157503.7 258496.29 255672.04 160328.0
df_Q7$kmodes_ratio_error <- ((df_Q7$predicted_salary_kmodes / df_Q7$salary_in_usd)-1)*100
df_Q7$lm_ratio_error <- ((df_Q7$predicted_salary_lm / df_Q7$salary_in_usd)-1)*100
summary(df_Q7[, c("kmodes_ratio_error","lm_ratio_error")])
## kmodes_ratio_error lm_ratio_error
## Min. : -80.002 Min. :-193.78
## 1st Qu.: -19.918 1st Qu.: -17.07
## Median : 4.267 Median : 4.74
## Mean : 49.975 Mean : 33.21
## 3rd Qu.: 46.444 3rd Qu.: 32.40
## Max. :5105.239 Max. :4329.28
quantiles_kmodes <- quantile(df_Q7$kmodes_ratio_error, c(0.25,0.75))
quantiles_lm <- quantile(df_Q7$lm_ratio_error, c(0.25,0.75))
df_Q7 <- df_Q7[order(df_Q7$salary_in_usd),]
# Add a column to the dataframe with a sequence of integers
x <- 1:nrow(df_Q7)
df_Q7$status_lm <- ifelse(df_Q7$lm_ratio_error < quantiles_kmodes[1], "Overpaid",
ifelse(df_Q7$lm_ratio_error < quantiles_kmodes[2], "Normal", "Underpaid"))
df_Q7$status_kmodes <- ifelse(df_Q7$kmodes_ratio_error < quantiles_lm[1], "Overpaid",
ifelse(df_Q7$kmodes_ratio_error < quantiles_lm[2], "Normal", "Underpaid"))
ggplot(df_Q7, aes(x = x, y = salary_in_usd)) +
geom_point(aes(y = salary_in_usd), size= 2, shape = 20) +
geom_point(aes(y = predicted_salary_lm, color = status_lm), size =2, shape = 20, alpha = 0.9) +
scale_color_manual(
values = c("Overpaid" = "red", "Underpaid" = "orange", "Normal" = "green"),
labels = c('Normal', 'Overpaid', 'Underpaid'),
aesthetics = c("colour", "fill"),
name = "Consideration") +
ggtitle("Plot of salary and predicted salaries using Linear Regression") +
ylab("Salary in USD") +
scale_y_continuous(labels = label_comma(), breaks = scales::pretty_breaks(n = 10))
ggplot(df_Q7, aes(x = x, y = salary_in_usd)) +
geom_point(aes(y = salary_in_usd), size= 2, shape = 20) +
geom_point(aes(y = predicted_salary_kmodes, color = status_kmodes), size =2, shape = 20, alpha = 0.9) +
scale_color_manual(
values = c("Overpaid" = "red", "Underpaid" = "orange", "Normal" = "green"),
labels = c('Normal', 'Overpaid', 'Underpaid'),
aesthetics = c("colour", "fill"),
name = "Consideration") +
ggtitle("Plot of salary and predicted salaries using Clustering") +
ylab("Salary in USD") +
labs(fill="Consideration") +
scale_y_continuous(labels = label_comma(), breaks = scales::pretty_breaks(n = 10))
df_Q7$status_lm <- as.factor(df_Q7$status_lm)
df_Q7$status_kmodes <- as.factor(df_Q7$status_kmodes)
ct <- table(df_Q7$status_lm,df_Q7$status_kmodes)
ct <- ct[c(3,1,2),c(3,1,2)]
ct <- melt(ct)
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
colnames(ct) <- c("LinearRegression", "Clustering", "Count")
ggplot(ct, aes(x = LinearRegression, y = Clustering, fill = Count)) +
geom_tile() +
scale_fill_gradient2(low = "#075AFF", mid = "#FFFFCC", high = "#FF0000") +
geom_text(aes(label = Count), color = "black", size = 4) +
coord_fixed() +
ggtitle("Comparison between Clustering and Linear Regression")
underpaid_lm <- df_Q7[df_Q7$status_lm=="Underpaid",]
overpaid_lm <- df_Q7[df_Q7$status_lm=="Overpaid",]
underpaid_kmodes <- df_Q7[df_Q7$status_kmodes =="Underpaid",]
overpaid_kmodes <- df_Q7[df_Q7$status_kmodes =="Overpaid",]
variables <- c("experience_level", "remote_ratio", "salary_group", "employee_region", "company_region", "role", "boss", "research", "company_size", "industry")
for (i in 1:length(variables)){
ratio_df <- table(df_Q7[, c(variables[i])])/nrow(df_Q7)
ratio_up_lm <- table(underpaid_lm[, c(variables[i])])/nrow(underpaid_lm)
ratio_op_lm <- table(overpaid_lm[, c(variables[i])])/nrow(overpaid_lm)
# plot them in such a way that they can be compared
df_long_lm <- melt(data.frame(ratio_up_lm, ratio_df, ratio_op_lm))
ratio_up_kmodes <- table(underpaid_kmodes[, c(variables[i])])/nrow(underpaid_kmodes)
ratio_op_kmodes <- table(overpaid_kmodes[, c(variables[i])])/nrow(overpaid_kmodes)
# plot them in such a way that they can be compared
df_long_kmodes <- melt(data.frame(ratio_up_kmodes, ratio_df, ratio_op_kmodes))
print(ggplot(data = df_long_lm, aes(x = variable, y = value , fill = Var1, label = round(value, 2))) +
geom_bar(stat = "identity", position = "fill") +
geom_text(size = 3, position = position_stack(vjust = 0.5)) +
scale_x_discrete(labels = c("Freq"= "Underpaid", "Freq.1"= "Normal", "Freq.2" = "Overpaid")) +
labs(fill=variables[i], title=paste("Stacked percent bar chart of ",variables[i], " (LM)"), x="Consideration", y = "%"))
print(ggplot(data = df_long_kmodes, aes(x = variable, y = value , fill = Var1, label = round(value, 2))) +
geom_bar(stat = "identity", position = "fill") +
geom_text(size = 3, position = position_stack(vjust = 0.5)) +
scale_x_discrete(labels = c("Freq"= "Underpaid", "Freq.1"= "Normal", "Freq.2" = "Overpaid")) +
labs(fill=variables[i], title=paste("Stacked percent bar chart of ",variables[i], " (Clustering)"), x="Consideration", y = "%"))
}
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables
Can we predict an employee’s experience level based on their salary and job title? Using classification algorithms, we could build a model that predicts an employee’s experience level based on their salary and job title. This could be useful for employees and job seekers who want to understand what experience level they can expect to be paid for a given salary and job title. We could also include other factors, such as location and company size, to improve the accuracy of the model.
For this we’ll use a Decision Tree since we are interested in having a clear way to visualize and understand the result. We want to classify the experience level of employees based on attributes such as their role, industry for which they work, salary, etcetera.
BASE MODEL: Predict experience level with only salary and job title; since job title is a factor with numerous levels industry and role will be used instead.
df14 = df[, c("industry", "role", "salary_in_usd", "experience_level")]
dt14 = rpart(experience_level~., data=df14)
rpart.plot(dt14)
ADVANCED MODEL: Predict experience level with all of the variables available (apart from those which factor level exceeds 32).
df14v2 = df[ , -which(names(df) %in% c("job_title", "work_year", "ID", "company_location", "employee_residence", "employee_region", "company_region", "salary_group", "salary_currency", "remote_ratio"))]
summary(df14v2)
## experience_level employment_type salary_in_usd company_size industry boss role research
## EN:147 CT: 8 Min. : 2324 L:322 DATA :1076 Mode :logical ANALYST :232 Mode :logical
## EX: 45 FL: 5 1st Qu.: 75593 M:898 ML/AI: 157 FALSE:1276 ARCHITECT: 40 FALSE:1306
## MI:337 FT:1306 Median :120000 S:112 OTHER: 99 TRUE :56 DEVELOPER: 16 TRUE :26
## SE:803 PT: 13 Mean :123375 ENGINEER :527
## 3rd Qu.:164997 MANAGER : 50
## Max. :600000 OTHER : 41
## SCIENTIST:426
dt14v2 = rpart(experience_level~., data=df14v2)
rpart.plot(dt14v2)
set.seed(123)
df_no_s <- df
df_no_s$salary_group <- NULL
df_no_s$salary_currency <- NULL
df_no_s$salary_in_usd <- NULL
train_index <- createDataPartition(df_no_s$experience_level, p = 0.7, list = FALSE)
train_data <- df_no_s[train_index, ]
test_data <- df_no_s[-train_index, ]
model <- train(experience_level ~ ., data = train_data, method = "knn", tuneGrid = expand.grid(k = 1:20))
print(model$bestTune$k)
## [1] 1
summary(model)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## theDots 0 -none- list
## xNames 214 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 4 -none- character
## param 0 -none- list
importance <- varImp(model)
plot(importance)
predictions <- predict(model, test_data)
cMatrix <- confusionMatrix(predictions, test_data$experience_level)
cMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction EN EX MI SE
## EN 11 0 11 5
## EX 0 3 1 2
## MI 16 3 51 15
## SE 17 7 38 218
##
## Overall Statistics
##
## Accuracy : 0.7111
## 95% CI : (0.6638, 0.7551)
## No Information Rate : 0.603
## P-Value [Acc > NIR] : 4.532e-06
##
## Kappa : 0.4374
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: EN Class: EX Class: MI Class: SE
## Sensitivity 0.25000 0.230769 0.5050 0.9083
## Specificity 0.95480 0.992208 0.8855 0.6076
## Pos Pred Value 0.40741 0.500000 0.6000 0.7786
## Neg Pred Value 0.91105 0.974490 0.8403 0.8136
## Prevalence 0.11055 0.032663 0.2538 0.6030
## Detection Rate 0.02764 0.007538 0.1281 0.5477
## Detection Prevalence 0.06784 0.015075 0.2136 0.7035
## Balanced Accuracy 0.60240 0.611489 0.6952 0.7580
cMatrix$byClass[1]
## [1] 0.25
This could be useful for employees that are going to recieve a promotion, and want to know how much they should get paid for their new position. It could also be useful for employers that want to know how much they should offer new hires or promoting employees.
f <- ggplot(df, aes(x=reorder(experience_level,salary_in_usd), y=salary_in_usd))+
geom_boxplot()+
ggtitle("Salary Distribution for each experience level") + ylab("Salary (USD)") + xlab("")
f
As we can see, from ENtry level to EXecutive level, as the experience
increases, the larger the average salary gets. Lets see if we can find,
using clustering, 3 zones for each
custom_line_plot <- function(datos,avgs, title){
plot1 <- ggplot()+
geom_violin(data=datos, aes(x=reorder(experience_level,salary_in_usd), y = salary_in_usd))+
geom_line(data=avgs,aes(x=Experience_Level, y=Average_Salary, group=1))+
geom_point(data=avgs,aes(x=Experience_Level, y=Average_Salary, group=1))+
geom_text(data=avgs,aes(x=Experience_Level, y=Average_Salary, label = round(Average_Salary,0)) ,hjust=-0.2, vjust=-1)+
scale_y_continuous(breaks = scales::pretty_breaks(n = 15))+
labs(title=title)
plot1
}
custom_line_plot(df_data,data_avg,"Data Industry")
custom_line_plot(df_ai,ai_avg,"AI/ML Industry")
custom_line_plot(df_other, other_avg,"Other Industries")
get_increases <- function(dframe){
dframe$increase <- 0
dframe$increase[dframe$Experience_Level=="MI"] <- dframe$Average_Salary[dframe$Experience_Level=="MI"] - dframe$Average_Salary[dframe$Experience_Level=="EN"]
dframe$increase[dframe$Experience_Level=="SE"] <- dframe$Average_Salary[dframe$Experience_Level=="SE"] - dframe$Average_Salary[dframe$Experience_Level=="MI"]
dframe$increase[dframe$Experience_Level=="EX"] <- dframe$Average_Salary[dframe$Experience_Level=="EX"] - dframe$Average_Salary[dframe$Experience_Level=="SE"]
dframe$increasePercentage <- 0
dframe$increasePercentage[dframe$Experience_Level=="MI"] <- dframe$Average_Salary[dframe$Experience_Level=="MI"] / dframe$Average_Salary[dframe$Experience_Level=="EN"]
dframe$increasePercentage[dframe$Experience_Level=="SE"] <- dframe$Average_Salary[dframe$Experience_Level=="SE"] / dframe$Average_Salary[dframe$Experience_Level=="MI"]
dframe$increasePercentage[dframe$Experience_Level=="EX"] <- dframe$Average_Salary[dframe$Experience_Level=="EX"] / dframe$Average_Salary[dframe$Experience_Level=="SE"]
dframe$increasePercentage <- ifelse(dframe$Experience_Level=="EN",0,(dframe$increasePercentage - 1) * 100)
return(dframe)
}
data_avg <- get_increases(data_avg)
ai_avg <- get_increases(ai_avg)
other_avg <- get_increases(other_avg)
data_avg
## Experience_Level Average_Salary increase increasePercentage
## 1 EN 57413.78 0.00 0.00000
## 2 MI 86994.15 29580.37 51.52138
## 3 SE 144961.30 57967.15 66.63339
## 4 EX 205371.24 60409.94 41.67315
ai_avg
## Experience_Level Average_Salary increase increasePercentage
## 1 EN 65501.3 0.00 0.000000
## 2 MI 108864.9 43363.55 66.202585
## 3 SE 151329.1 42464.23 39.006374
## 4 EX 164241.8 12912.66 8.532837
other_avg
## Experience_Level Average_Salary increase increasePercentage
## 1 EN 68638.79 0.00 0.00000
## 2 MI 94903.37 26264.58 38.26493
## 3 SE 133025.39 38122.02 40.16930
## 4 EX 158747.00 25721.61 19.33586
First we should have a look at how much are employees paid while working in the office at their same region.
df_filtered <- filter(df, employee_region == company_region)
print(ggplot(aes(x = employee_region, y = salary_in_usd, fill=employee_region), data = df_filtered) +
stat_summary(fun=mean, geom="bar", position = "stack") +
stat_summary(aes(label=round(after_stat(y),2)), fun.y=mean, geom="text", size=3, vjust = -0.5) +
xlab("Region") + labs(title="Salary of any work in the same Region"))
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
Here we can see that most of the Regions are underpaid for office work compared to North America, especially the rest of america.
Could they earn more if they work remotely for another Region?
Lets have a look at how much do companies in each region pay for Remote work.
df_filtered <- filter(df, employee_region != company_region & remote_ratio == "Remote")
print(ggplot(aes(x = company_region, y = salary_in_usd, fill=company_region), data = df_filtered) +
stat_summary(fun=mean, geom="bar", position = "stack") +
stat_summary(aes(label=round(after_stat(y),2)), fun.y=mean, geom="text", size=3, vjust = -0.5) +
xlab("Region") + labs(title="Salary of Remote work in another Region"))
We can see that by average North America and Europe are close, while the
rest are much lower, especially Asia. The regions that do not appear,
don’t employ remote workers from another regions.
In relation to the high paying, we can see that USA pays less for remote work, than for office work. On the other hand in Europe the salary is on the same level for remote and office work.
Now we can focus on answering the question.
aux <- data.frame(matrix(0, nrow = length(levels(df$company_region)), ncol = length(levels(df$company_region))))
colnames(aux) <- levels(df$company_region)
rownames(aux) <- levels(df$company_region)
for (i in 1:nrow(aux)) {
max = 0
for (j in 1:ncol(aux)) {
df_filtered <- filter(df, company_region == colnames(aux)[j] & employee_region == rownames(aux)[i])
aux[i, j] <- mean(df_filtered$salary_in_usd)
}
}
aux_clean <- data.frame(matrix(0, nrow = length(levels(df$company_region)), ncol = length(levels(df$company_region))))
colnames(aux_clean) <- levels(df$company_region)
rownames(aux_clean) <- levels(df$company_region)
aux <- replace(aux, is.na(aux), 0)
for (i in 1:nrow(aux_clean)) {
max <- 0
for (j in 1:ncol(aux_clean)) {
#if(i == j) {
# aux_clean[i,j] <- aux[i,j]
#} else
if(aux[i,j]>max){
max <- aux[i,j]
aux_clean[i,j] <- aux[i,j]
}
}
}
data_long <- aux_clean %>%
rownames_to_column %>%
gather(key = 'key', value = 'value', -rowname) %>%
filter(value > 0)
colnames(data_long) <- c("source", "target", "value")
data_long$target <- paste(data_long$target, " ", sep="")
nodes <- data.frame(name=c(as.character(data_long$source), as.character(data_long$target)) %>% unique())
data_long$IDsource=match(data_long$source, nodes$name)-1
data_long$IDtarget=match(data_long$target, nodes$name)-1
sankeyNetwork(Links = data_long, Nodes = nodes,
Source = "IDsource", Target = "IDtarget",
Value = "value", NodeID = "name",
sinksRight=FALSE, nodeWidth=40, fontSize=13, nodePadding=20)
For this we’ll use visualization techniques with the ggplot library. The width of the boxes depends on the amount of samples each class/value has so we can extract better conclusions out of them.
ggplot(df, aes(x = as.factor(work_year), y = salary_in_usd)) +
geom_boxplot(varwidth=T, fill="lightblue") +
labs(title = "Salary depending on the work year", x = "Year", y = "Salary in USD") +
theme_minimal() +
scale_y_continuous(labels = scales::dollar_format())
ggplot(df, aes(x = experience_level, y = salary_in_usd)) +
geom_boxplot(varwidth=T, fill="lightblue") +
labs(title = "Salary depending on the experience level", x = "Experience level", y = "Salary in USD") +
theme_minimal() +
scale_y_continuous(labels = scales::dollar_format())
ggplot(df, aes(x = employment_type, y = salary_in_usd)) +
geom_boxplot(varwidth=T, fill="lightblue") +
labs(title = "Salary depending on the employment type", x = "Employment type", y = "Salary in USD") +
theme_minimal() +
scale_y_continuous(labels = scales::dollar_format())
ggplot(df, aes(x = employee_region, y = salary_in_usd)) +
geom_boxplot(varwidth=T, fill="lightblue") +
labs(title = "Salary depending on the employee region of residence", x = "Region of residence", y = "Salary in USD") +
theme_minimal() +
scale_y_continuous(labels = scales::dollar_format())
ggplot(df, aes(x = company_size, y = salary_in_usd)) +
geom_boxplot(varwidth=T, fill="lightblue") +
labs(title = "Salary depending on the company size", x = "Company size", y = "Salary in USD") +
theme_minimal() +
scale_y_continuous(labels = scales::dollar_format())
ggplot(df, aes(x = role, y = salary_in_usd)) +
geom_boxplot(varwidth=T, fill="lightblue") +
labs(title = "Salary depending on the role", x = "Role", y = "Salary in USD") +
theme_minimal() +
scale_y_continuous(labels = scales::dollar_format())
ggplot(df, aes(x = industry, y = salary_in_usd)) +
geom_boxplot(varwidth=T, fill="lightblue") +
labs(title = "Salary depending on the industry", x = "Industry", y = "Salary in USD") +
theme_minimal() +
scale_y_continuous(labels = scales::dollar_format())
ggplot(df, aes(x = boss, y = salary_in_usd)) +
geom_boxplot(varwidth=T, fill="lightblue") +
labs(title = "Salary depending on whether the employee has a leadership position", x = "Has a leadership position", y = "Salary in USD") +
theme_minimal() +
scale_y_continuous(labels = scales::dollar_format())
ggplot(df, aes(x = research, y = salary_in_usd)) +
geom_boxplot(varwidth=T, fill="lightblue") +
labs(title = "Salary depending on whether the employee participates in research projects", x = "Works in research projects", y = "Salary in USD") +
theme_minimal() +
scale_y_continuous(labels = scales::dollar_format())
Let’s go through the different plots and check if we can identify any disparity:
increase = (mean(df[df$work_year == 2022, 'salary_in_usd'])/mean(df[df$work_year == 2021, 'salary_in_usd'])) - 1
net_increase = increase - 0.077 # official inflation rate in 2022 in the US
net_increase
## [1] 0.2717521
It seems like the increase in salaries between 2021 and 2022 is not explained only with the inflation, since the increase has been of over a 34% in total and the official 2022 inflation rate that the US government gives is ay 7.7%. The net salary increase taking that into account (most samples of the data are US based), net salaries have increased a 27.17%.
We only want the country and the cost of living and we delete Kosovo from the list since it is a disputed territory and luckily enough we don’t have any Kosovo related data in our dataset
cost_of_living_df = read.csv("Cost_of_Living_Index_2022.csv", stringsAsFactors = TRUE)[-67, c(2,3)]
cost_of_living_df = setNames(cost_of_living_df, c("country", "cost_of_living_index"))
cost_of_living_df$country = countrycode(cost_of_living_df$country, origin = 'country.name', destination = 'iso2c')
Add the cost of living index columns of the employee residence and the company location deleting those rows that contain either American Samoa or Aaland Islands, since we don’t have information about the cost of living index in these countries.
dfN<-subset(df, employee_residence!="AX" & employee_residence!="AS" & company_location!="AX" & company_location!="AS")
dfN$cost_of_living_index_employee = rep(0, nrow(dfN))
dfN$cost_of_living_index_company = rep(0, nrow(dfN))
for(i in 1:nrow(dfN)) {
country_employee = dfN[i, ]$employee_residence
country_company = dfN[i, ]$company_location
index_employee = cost_of_living_df[as.character(cost_of_living_df$country)==as.character(country_employee), ]$cost_of_living_index
index_company = cost_of_living_df[as.character(cost_of_living_df$country)==as.character(country_company), ]$cost_of_living_index
dfN[i,]$cost_of_living_index_employee = index_employee
dfN[i,]$cost_of_living_index_company = index_company
}
Let’s find the employees that live in a cheaper country than the one that they work for.
subset(dfN, cost_of_living_index_employee>cost_of_living_index_company)
## work_year experience_level employment_type job_title salary_currency salary_in_usd employee_residence remote_ratio company_location company_size salary_group employee_region company_region industry boss role research cost_of_living_index_employee cost_of_living_index_company
## 439 2022 EN FT Data Scientist USD 50000 US Hybrid DE M 50k North America Europe DATA FALSE SCIENTIST FALSE 70.13 65.58
## 440 2022 EN FT Data Analyst INR 6413 FR Remote IN L 25k Europe Asia DATA FALSE ANALYST FALSE 74.13 24.43
## 643 2022 SE FT Applied Machine Learning Scientist EUR 77280 FR Remote GB L 100k Europe Europe ML/AI FALSE SCIENTIST FALSE 74.13 69.65
## 745 2022 MI FT Machine Learning Engineer EUR 84229 FR Remote DE M 100k Europe Europe ML/AI FALSE ENGINEER FALSE 74.13 65.58
## 1015 2022 EN FT Data Scientist USD 40000 JP Remote MY L 50k Asia Asia DATA FALSE SCIENTIST FALSE 77.03 37.02
## 1051 2022 MI FL Data Scientist USD 100000 CA Remote US M 100k North America North America DATA FALSE SCIENTIST FALSE 70.22 70.13
## 1073 2021 MI FT Big Data Engineer USD 60000 ES Hybrid RO M 75k Europe Europe DATA FALSE ENGINEER FALSE 53.88 35.24
## 1113 2021 EX FT Director of Data Science EUR 153667 IT Remote PL L 200k Europe Europe DATA TRUE OTHER FALSE 66.47 38.95
## 1194 2021 EN FT Research Scientist USD 100000 JE Office CN L 100k Europe Asia OTHER FALSE SCIENTIST TRUE 92.02 41.77
## 1202 2021 EN FT Machine Learning Engineer USD 85000 NL Remote DE S 100k Europe Europe ML/AI FALSE ENGINEER FALSE 75.66 65.58
## 1233 2021 EX FT Data Science Consultant EUR 69741 FR Remote ES S 75k Europe Europe DATA FALSE OTHER FALSE 74.13 53.88
## 1268 2021 MI FT Data Engineer GBP 66022 HK Hybrid GB S 75k Asia Europe DATA FALSE ENGINEER FALSE 80.71 69.65
## 1277 2021 SE FT Research Scientist USD 50000 FR Remote US S 50k Europe North America OTHER FALSE SCIENTIST TRUE 74.13 70.13
subset(dfN, as.character(company_location)!=as.character(employee_residence))
## work_year experience_level employment_type job_title salary_currency salary_in_usd employee_residence remote_ratio company_location company_size salary_group employee_region company_region industry boss role research cost_of_living_index_employee cost_of_living_index_company
## 68 2022 EN PT Data Analyst USD 125404 CN Hybrid US S 150k Asia North America DATA FALSE ANALYST FALSE 41.77 70.13
## 157 2022 SE FT Product Data Scientist USD 8000 IN Remote SG L 25k Asia Asia DATA FALSE SCIENTIST FALSE 24.43 83.98
## 271 2022 EN FT Data Analyst USD 20000 CR Hybrid US M 25k Central America North America DATA FALSE ANALYST FALSE 47.01 70.13
## 315 2022 EN FT Data Specialist USD 105000 CL Remote US L 125k South America North America DATA FALSE OTHER FALSE 43.90 70.13
## 357 2022 EN FT AI Scientist EUR 31586 PT Remote ES M 50k Europe Europe ML/AI FALSE SCIENTIST FALSE 47.94 53.88
## 373 2022 MI CT NLP Engineer USD 60000 IN Remote US S 75k Asia North America OTHER FALSE ENGINEER FALSE 24.43 70.13
## 403 2022 SE FT Data Engineering Manager USD 193000 ES Remote US M 200k Europe North America DATA FALSE MANAGER FALSE 53.88 70.13
## 439 2022 EN FT Data Scientist USD 50000 US Hybrid DE M 50k North America Europe DATA FALSE SCIENTIST FALSE 70.13 65.58
## 440 2022 EN FT Data Analyst INR 6413 FR Remote IN L 25k Europe Asia DATA FALSE ANALYST FALSE 74.13 24.43
## 500 2022 MI FT Applied Machine Learning Scientist USD 75000 BO Remote US M 75k South America North America ML/AI FALSE SCIENTIST FALSE 34.77 70.13
## 538 2022 EN PT Data Scientist USD 110000 DO Remote FR M 125k Central America Europe DATA FALSE SCIENTIST FALSE 41.77 74.13
## 631 2022 MI FT Data Scientist INR 53877 IN Remote ID L 75k Asia Asia DATA FALSE SCIENTIST FALSE 24.43 35.85
## 636 2022 SE FT Analytics Engineer USD 48000 AR Remote US S 50k South America North America OTHER FALSE ENGINEER FALSE 34.69 70.13
## 643 2022 SE FT Applied Machine Learning Scientist EUR 77280 FR Remote GB L 100k Europe Europe ML/AI FALSE SCIENTIST FALSE 74.13 69.65
## 726 2022 MI FT AI Scientist USD 200000 IN Remote US L 200k Asia North America ML/AI FALSE SCIENTIST FALSE 24.43 70.13
## 745 2022 MI FT Machine Learning Engineer EUR 84229 FR Remote DE M 100k Europe Europe ML/AI FALSE ENGINEER FALSE 74.13 65.58
## 796 2022 SE FT Data Science Engineer USD 60000 AR Remote MX L 75k South America North America DATA FALSE ENGINEER FALSE 34.69 35.35
## 999 2022 MI FT Applied Machine Learning Scientist USD 75000 BO Remote US L 75k South America North America ML/AI FALSE SCIENTIST FALSE 34.77 70.13
## 1006 2022 EN FT Computer Vision Engineer USD 10000 PT Remote LU M 25k Europe Europe OTHER FALSE ENGINEER FALSE 47.94 80.50
## 1009 2022 SE FT Head of Data USD 200000 MY Remote US M 200k Asia North America DATA TRUE OTHER FALSE 37.02 70.13
## 1015 2022 EN FT Data Scientist USD 40000 JP Remote MY L 50k Asia Asia DATA FALSE SCIENTIST FALSE 77.03 37.02
## 1018 2020 EN FT Data Engineer EUR 54742 PK Remote DE L 75k Asia Europe DATA FALSE ENGINEER FALSE 19.92 65.58
## 1023 2022 SE FT Data Scientist USD 100000 BR Remote US M 100k South America North America DATA FALSE SCIENTIST FALSE 33.24 70.13
## 1028 2022 EN CT Applied Machine Learning Scientist EUR 30533 TN Remote CZ M 50k Africa Europe ML/AI FALSE SCIENTIST FALSE 27.87 48.24
## 1030 2022 EN FT Data Engineer EUR 55591 PK Remote DE M 75k Asia Europe DATA FALSE ENGINEER FALSE 19.92 65.58
## 1044 2022 MI FT Data Scientist USD 48000 RU Remote US S 50k Europe North America DATA FALSE SCIENTIST FALSE 35.26 70.13
## 1051 2022 MI FL Data Scientist USD 100000 CA Remote US M 100k North America North America DATA FALSE SCIENTIST FALSE 70.22 70.13
## 1057 2021 MI FT Applied Machine Learning Scientist USD 38400 VN Remote US M 50k Asia North America ML/AI FALSE SCIENTIST FALSE 37.48 70.13
## 1073 2021 MI FT Big Data Engineer USD 60000 ES Hybrid RO M 75k Europe Europe DATA FALSE ENGINEER FALSE 53.88 35.24
## 1089 2020 SE FT Data Scientist EUR 68428 GR Remote US L 75k Europe North America DATA FALSE SCIENTIST FALSE 56.22 70.13
## 1108 2021 EN PT AI Scientist USD 12000 PK Remote US M 25k Asia North America ML/AI FALSE SCIENTIST FALSE 19.92 70.13
## 1111 2021 MI FT Applied Data Scientist CAD 54238 GB Hybrid CA L 75k Europe North America DATA FALSE SCIENTIST FALSE 69.65 70.22
## 1113 2021 EX FT Director of Data Science EUR 153667 IT Remote PL L 200k Europe Europe DATA TRUE OTHER FALSE 66.47 38.95
## 1119 2021 SE FT Data Analyst USD 80000 BG Remote US S 100k Europe North America DATA FALSE ANALYST FALSE 38.38 70.13
## 1126 2020 MI FT Data Scientist USD 45760 PH Remote US S 50k Asia North America DATA FALSE SCIENTIST FALSE 37.06 70.13
## 1130 2021 EX FT BI Data Analyst USD 150000 IN Remote US L 150k Asia North America DATA FALSE ANALYST FALSE 24.43 70.13
## 1135 2021 SE FT Marketing Data Analyst EUR 88654 GR Remote DK L 100k Europe Europe DATA FALSE ANALYST FALSE 56.22 84.12
## 1146 2021 MI FT BI Data Analyst HUF 36259 HU Hybrid US L 50k Europe North America DATA FALSE ANALYST FALSE 40.66 70.13
## 1149 2021 MI FT Computer Vision Software Engineer EUR 95746 DE Remote US S 100k Europe North America OTHER FALSE ENGINEER FALSE 65.58 70.13
## 1157 2021 SE FT Cloud Data Engineer USD 160000 BR Remote US S 200k South America North America DATA FALSE ENGINEER FALSE 33.24 70.13
## 1194 2021 EN FT Research Scientist USD 100000 JE Office CN L 100k Europe Asia OTHER FALSE SCIENTIST TRUE 92.02 41.77
## 1202 2021 EN FT Machine Learning Engineer USD 85000 NL Remote DE S 100k Europe Europe ML/AI FALSE ENGINEER FALSE 75.66 65.58
## 1210 2021 SE FT Lead Data Engineer USD 160000 PR Hybrid US S 200k Central America North America DATA TRUE ENGINEER FALSE 66.53 70.13
## 1211 2021 MI FT Data Scientist EUR 25532 RS Remote DE S 50k Europe Europe DATA FALSE SCIENTIST FALSE 36.21 65.58
## 1222 2021 SE FT Data Science Manager INR 54094 IN Hybrid US L 75k Asia North America DATA FALSE MANAGER FALSE 24.43 70.13
## 1233 2021 EX FT Data Science Consultant EUR 69741 FR Remote ES S 75k Europe Europe DATA FALSE OTHER FALSE 74.13 53.88
## 1234 2021 SE FT Data Analytics Engineer USD 50000 VN Remote GB M 50k Asia Europe DATA FALSE ENGINEER FALSE 37.48 69.65
## 1238 2021 MI FT Data Engineer EUR 26005 RO Office US L 50k Europe North America DATA FALSE ENGINEER FALSE 35.24 70.13
## 1241 2021 MI FT Data Scientist INR 5679 IN Remote US S 25k Asia North America DATA FALSE SCIENTIST FALSE 24.43 70.13
## 1246 2021 MI FT Data Scientist EUR 61467 DE Hybrid AT M 75k Europe Europe DATA FALSE SCIENTIST FALSE 65.58 71.04
## 1251 2021 SE FT Machine Learning Scientist USD 225000 US Remote CA L 250k North America North America ML/AI FALSE SCIENTIST FALSE 70.13 70.22
## 1258 2020 SE FT Big Data Engineer EUR 114047 PL Remote GB S 125k Europe Europe DATA FALSE ENGINEER FALSE 38.95 69.65
## 1261 2020 MI FT Lead Data Engineer USD 56000 PT Remote US M 75k Europe North America DATA TRUE ENGINEER FALSE 47.94 70.13
## 1268 2021 MI FT Data Engineer GBP 66022 HK Hybrid GB S 75k Asia Europe DATA FALSE ENGINEER FALSE 80.71 69.65
## 1272 2021 MI FL Data Engineer USD 20000 IT Office US L 25k Europe North America DATA FALSE ENGINEER FALSE 66.47 70.13
## 1277 2021 SE FT Research Scientist USD 50000 FR Remote US S 50k Europe North America OTHER FALSE SCIENTIST TRUE 74.13 70.13
## 1282 2021 EN FT Big Data Engineer INR 5882 IN Office CH L 25k Asia Europe DATA FALSE ENGINEER FALSE 24.43 123.35
## 1294 2021 SE FT Data Engineer EUR 76833 RO Hybrid GB S 100k Europe Europe DATA FALSE ENGINEER FALSE 35.24 69.65
## 1296 2021 SE FT Data Science Manager USD 152000 US Remote FR L 200k North America Europe DATA FALSE MANAGER FALSE 70.13 74.13
## 1298 2020 SE FL Computer Vision Engineer USD 60000 RU Remote US S 75k Europe North America OTHER FALSE ENGINEER FALSE 35.26 70.13
## 1309 2020 MI FT Data Scientist EUR 62726 FR Hybrid LU S 75k Europe Europe DATA FALSE SCIENTIST FALSE 74.13 80.50
## 1313 2021 EN PT AI Scientist USD 12000 BR Remote US S 25k South America North America ML/AI FALSE SCIENTIST FALSE 33.24 70.13
## 1321 2020 MI FT Data Engineer USD 130800 ES Remote US M 150k Europe North America DATA FALSE ENGINEER FALSE 53.88 70.13
## 1324 2021 MI FT Data Scientist SGD 119059 SG Remote IL M 125k Asia Asia DATA FALSE SCIENTIST FALSE 83.98 88.05